# Date: May 28, 2020
# --- Library imports ---
import requests  # HTTP requests (used later to fetch the GeoJSON map data)
import pandas as pd  # data analysis
import numpy as np  # vectorized numeric operations
import random  # random number generation

# Libraries for displaying images
from IPython.display import Image
from IPython.core.display import HTML

# Transforming a JSON structure into a pandas DataFrame.
# NOTE: `pandas.io.json.json_normalize` was deprecated in pandas 1.0 and
# removed in 2.0; the supported import is from the top-level namespace.
from pandas import json_normalize

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import matplotlib as mpl
import seaborn as sns

print('Libraries imported.')
# Load the yearly trend of the average age at first marriage and plot it.
trenddata = pd.read_csv("Data11_Age_GetMarried_Trend.csv")
trenddata.set_index('Year', inplace=True)

# Line chart of the trend; figsize is a (width, height) tuple in inches.
trenddata.plot(kind='line', figsize=(12, 6))
plt.title('Trend of Average Age at the First Marriage')
plt.ylabel('Average Age')
plt.xlabel('Year')
plt.show()
# Load the master dataset and take a first look at it.
data = pd.read_csv("Data_Master.csv")
data.head()

# Confirm that every column label is a string.
all(isinstance(col, str) for col in data.columns)

# Summary of dtypes and non-null counts.
data.info()

# Dimensions of the DataFrame: (rows, columns).
data.shape
# Visualize the distribution of every variable with a grid of histograms.
temp1 = data.copy()
temp1.hist(
    bins=15,
    color='steelblue',
    edgecolor='black',
    linewidth=1.0,
    xlabelsize=8,
    ylabelsize=8,
    grid=False,
)
plt.tight_layout(rect=(0, 0, 2, 2))
# Review the pairwise correlation between attributes (all data points).
# NOTE: matplotlib.pyplot is already imported as `plt` at the top of the
# file; the redundant re-import that was here has been dropped.
temp2 = data.copy()
corr_matrix = temp2.corr()
# Colour-code the matrix for quick visual inspection of strong pairs.
corr_matrix.style.background_gradient(cmap='coolwarm')
# Based on the correlation analysis above, move forward by dropping
# 8 variables:
#   Area (km2), Population (Thousand), Immigration Rate, Migration Rate,
#   Marriage counts of the first time, Average Children,
#   Costs of Living Index, and Working Hours per month.
_columns_to_drop = [
    'Migration Rate',
    'Immigration Rate',
    'Population (Thousand)',
    'Area (km2)',
    'Marriage_Counts_1stTime',
    'Average_Children',
    'Cost_Of_Living_Index',
    'Working_Hours/Month',
]
temp3 = temp2.drop(columns=_columns_to_drop)
temp3.head()
# Pairwise scatter plots (all data points).
# NOTE: seaborn renamed pairplot's `size` parameter to `height` (the old
# name has been removed), and kdeplot's `shade` keyword to `fill`
# (`shade` was removed in seaborn 0.14).
cols = temp3.columns
pp = sns.pairplot(
    temp3[cols],
    height=1.8,
    aspect=1.8,
    plot_kws=dict(edgecolor="k", linewidth=0.5),
    diag_kind="kde",
    diag_kws=dict(fill=True),
)
fig = pp.fig
fig.subplots_adjust(top=0.93, wspace=0.3)
t = fig.suptitle('Data Attributes Pairwise Plots', fontsize=14)
def _scatter_vs_age(frame, x_col, size_col='Monthly_Income'):
    """Scatter-plot `x_col` against Age_First_Marriage.

    Marker area is scaled by `size_col` (monthly income), so wealthier
    rows draw larger markers.  Shows the figure as a side effect and
    returns None.
    """
    plt.scatter(frame[x_col], frame['Age_First_Marriage'],
                alpha=0.5, s=frame[size_col])
    plt.xlabel(x_col)
    plt.ylabel('Age_First_Marriage')
    plt.show()


# Relationship between each candidate driver and age at first marriage.
# NOTE: 'Divoirce_Counts' is the (misspelled) column name exactly as it
# appears in the dataset — do not "fix" the spelling here.
for _col in ['Net Migration Rate', 'Density (/km2)',
             'Monthly_Income', 'Divoirce_Counts']:
    _scatter_vs_age(temp3, _col)
# Normalization is a statistical method that helps mathematical-based
# algorithms to interpret features with different magnitudes and
# distributions equally. We use StandardScaler() to normalize our dataset.
# Normalize the features with StandardScaler so that every attribute
# contributes on the same scale to the distance-based clustering below.
from sklearn.preprocessing import StandardScaler

temp4 = temp3.copy()
# Column 0 holds the province/city label, so only columns 1+ are used as
# features; NaNs are replaced with 0 so the scaler can operate.
X = np.nan_to_num(temp4.values[:, 1:])
Clus_data = StandardScaler().fit_transform(X)
Clus_data
# Elbow method: run K-Means for k = 1..10 and record the within-cluster
# sum of squares (WCSS, i.e. the model inertia) to pick a cluster count.
# NOTE: `sklearn.datasets.samples_generator` was removed in scikit-learn
# 0.24; `make_blobs` lives directly in `sklearn.datasets` now.  The
# duplicate numpy/pandas/pyplot imports were dropped — they are already
# imported at the top of the file.
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans

wcss = []
for k in range(1, 11):
    kmeans = KMeans(n_clusters=k, init='k-means++', max_iter=1000,
                    n_init=10, random_state=0)
    kmeans.fit(Clus_data)
    wcss.append(kmeans.inertia_)  # inertia_ == WCSS of the fitted model

plt.plot(range(1, 11), wcss)
plt.title('Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()
# Fit the final K-Means model with k = 3 (chosen from the elbow plot).
clusterNum = 3
k_means = KMeans(init="k-means++", n_clusters=clusterNum, n_init=12).fit(Clus_data)
labels = k_means.labels_
print(labels)

# Attach each row's cluster label to the DataFrame.
temp4["Cluster"] = labels
temp4.head(5)

# Per-cluster feature means approximate the cluster centroids.
temp4.groupby('Cluster').mean()
# Distribution of provinces: age at first marriage vs monthly income,
# coloured by cluster label.
# NOTE: `np.float` was deprecated in NumPy 1.20 and removed in 1.24;
# the builtin `float` is the documented replacement.
plt.scatter(X[:, 2], X[:, 4], c=labels.astype(float), alpha=0.5)
plt.xlabel('Age_First_Marriage', fontsize=18)
plt.ylabel('Monthly_Income', fontsize=16)
plt.show()

# Age at first marriage vs population density, coloured by cluster label.
plt.scatter(X[:, 2], X[:, 0], c=labels.astype(float), alpha=0.5)
plt.xlabel('Age_First_Marriage', fontsize=18)
plt.ylabel('Density (/km2)', fontsize=16)
plt.show()
# Prepare a copy of the clustered data for the map visualisation.
temp5 = temp4.copy()
temp5['Cluster'] = temp5['Cluster'].astype('category')
temp5 = temp5.rename(columns={"Province/City": "province"})
temp5.info()

# Map each province name onto the spelling used by the GeoJSON file.
# The original per-row `loc[... .index.tolist() ...]` gymnastics is
# replaced with a single Series.replace over a rename dictionary.
_geojson_renames = {
    'Ho Chi Minh City': 'TP. Ho Chi Minh',
    'Thua Thien Hue': 'Thua Thien - Hue',
    'Ba Ria Vung Tau': 'Ba Ria - Vung Tau',
    'Lang son': 'Lang Son',
    'Dac Lak': 'Dak Lak',
}
temp5['json_name'] = temp5['province'].replace(_geojson_renames)
temp5.head(5)
# Render the clusters as a choropleth map of Vietnam.
# (seaborn and requests are already imported at the top of the file.)
import folium
import os
import json

# Centre the map roughly on Vietnam.
vn = folium.Map(
    location=[13.2904027, 108.4265113],
    zoom_start=5)

# Province boundaries (English names) from Open Development Mekong.
url = 'https://data.opendevelopmentmekong.net/dataset/999c96d8-fae0-4b82-9a2b-e481f6f50e12/resource/2818c2c5-e9c3-440b-a9b8-3029d7298065/download/diaphantinhenglish.geojson'
vn_geo = json.loads(requests.get(url).text)

# NOTE: `Map.choropleth()` was deprecated in folium 0.8 and later
# removed; `folium.Choropleth(...).add_to(map)` is the supported API.
folium.Choropleth(
    geo_data=vn_geo,
    data=temp5,
    columns=['json_name', 'Cluster'],
    key_on='feature.properties.Name',
    fill_color='OrRd',
    fill_opacity=0.75,
    line_opacity=0.2,
    legend_name='Cluster',
    threshold_scale=[0, 1, 2, 3],
).add_to(vn)
vn
# Inspect the member provinces of each cluster in turn.
# Cluster 0 — the first cluster
temp5.loc[temp5['Cluster'] == 0]
# Cluster 1 — the second cluster
temp5.loc[temp5['Cluster'] == 1]
# Cluster 2 — the third cluster
temp5.loc[temp5['Cluster'] == 2]